Please see code blocks below which have been used to load, prep, transform and plot our data.
This will serve as our initial pre-processing, which we can further analyze for our modelling phase and our paper.
Further analysis will also be performed as we dive deeper into our paper and final code.
load17.df <- read_excel('Native_Load_2017.xlsx')
load18.df <- read_excel('Native_Load_2018.xlsx')
load19.df <- read_excel('Native_Load_2019.xlsx')
load20.df <- read_excel('Native_Load_2020.xlsx')
load21.df <- read_excel('Native_Load_2021.xlsx')
## The load dataset for 2017 had a space in the Timestamp column, preventing the binding of the four years
colnames(load17.df)[1] <- "HourEnding"
## Using bind_rows from tidyverse, I merged 2017-2020
load_4_year <- bind_rows(load17.df,load18.df,load19.df,load20.df)
sum(is.na(load_4_year))
## [1] 0
## Using strptime, I converted the time column to a date variable
load_4_year$HourEnding <- as.POSIXct(strptime(load_4_year$HourEnding, format="%m/%d/%Y %H:%M"))
#load_4_year$HourEnding <- as.Date(load_4_year$HourEnding, format="%m/%d/%Y %H:%M") ## removed this to retain hourly info for plot
class(load_4_year$HourEnding)
## [1] "POSIXct" "POSIXt"
head(load_4_year)
## # A tibble: 6 x 10
## HourEnding COAST EAST FWEST NORTH NCENT SOUTH SCENT WEST ERCOT
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2017-01-01 01:00:00 8792. 897. 1998. 684. 9239. 2367. 4491. 954. 29421.
## 2 2017-01-01 02:00:00 8570. 866. 1998. 678. 9105. 2333. 4371. 951. 28871.
## 3 2017-01-01 03:00:00 8326. 839. 1994. 672. 8988. 2238. 4211. 944. 28212.
## 4 2017-01-01 04:00:00 8137. 823. 1996. 675. 8979. 2178. 4089. 943. 27820.
## 5 2017-01-01 05:00:00 8012. 814. 1995. 664. 9034. 2134. 4022. 955. 27629.
## 6 2017-01-01 06:00:00 7978. 823. 1981. 670. 9195. 2122. 4045. 973. 27788.
sum(is.na(load_4_year))
## [1] 4
#Remove NA's which arise due to issues with daylight savings (1 record in each year)
load_4_year <- na.omit(load_4_year)
## Converting to a time series
ts_load <- xts(load_4_year, load_4_year$HourEnding)
ts_load <- ts_load[,-1]
head(ts_load)
## COAST EAST FWEST NORTH NCENT
## 2017-01-01 01:00:00 " 8791.790" " 896.7463" "1997.718" " 683.6220" " 9239.153"
## 2017-01-01 02:00:00 " 8569.708" " 865.9306" "1997.781" " 677.9694" " 9104.997"
## 2017-01-01 03:00:00 " 8326.426" " 839.0512" "1993.699" " 671.9989" " 8988.035"
## 2017-01-01 04:00:00 " 8137.497" " 822.8293" "1995.541" " 675.2680" " 8979.148"
## 2017-01-01 05:00:00 " 8011.870" " 814.0162" "1995.254" " 663.6199" " 9033.548"
## 2017-01-01 06:00:00 " 7978.100" " 823.4194" "1981.352" " 669.7480" " 9195.158"
## SOUTH SCENT WEST ERCOT
## 2017-01-01 01:00:00 "2366.633" " 4490.781" " 954.1929" "29420.64"
## 2017-01-01 02:00:00 "2332.745" " 4370.657" " 951.0252" "28870.81"
## 2017-01-01 03:00:00 "2237.506" " 4210.650" " 944.3577" "28211.72"
## 2017-01-01 04:00:00 "2178.102" " 4088.713" " 943.1887" "27820.29"
## 2017-01-01 05:00:00 "2133.954" " 4021.757" " 954.9379" "27628.96"
## 2017-01-01 06:00:00 "2122.017" " 4045.300" " 972.5624" "27787.66"
## A function has been created to plot data once appropriate transformations have been made.
plot_data = function(df, title){
plot_melt = melt(df, na.rm = FALSE, value.name = 'Load', id = 'HourEnding')
names(plot_melt)[2] <- "Region"
plt = ggplot(data = plot_melt, mapping = aes(x=HourEnding, y=Load, colour = Region)) +
geom_line(size = 0.1, alpha=0.4) +
ggdark::dark_mode() +
theme(legend.position = 'bottom', ) +
ggtitle(paste('ERCOT Actual Power Load by Region - ' , title)) +
theme(plot.title = element_text(hjust = 0.5)) +
ylab("Electrical Load (MW)") +
xlab("Hour Ending") +
theme(panel.grid.major = element_blank()) +
guides(shape = guide_legend(override.aes = list(size = 1)),
color = guide_legend(override.aes = list(size = 1))) +
theme(legend.title = element_text(size = 8),
legend.text = element_text(size = 8),
legend.key.size = unit(1, "lines"))
return(plt)}
plot_data(load_4_year, 'Raw Data')
std = load_4_year %>%
mutate_if(is.numeric, scale)
plot_data(std, 'Standardized')
### Applying differencing and detrending to remove seasonality and make data stationary:
## A differenced dataframe is created
dload <- data.frame(lapply(load_4_year[,-1], diff))
dload["HourEnding"] <- load_4_year$HourEnding[2:35060]
plot_data(dload, 'Differenced')
## Building out the min/max scaler to normalize the data as a function
normalize <- function(x) {
(x-min(x))/(max(x)-min(x))
}
## Removing the POSIXlt column to normalize
loadnorm <- (load_4_year[,-1])
## Running the new DF through the normalization function
loadnorm <- normalize(loadnorm)
## Adding the POSIXlt column back on
loadnorm["HourEnding"] <- load_4_year$HourEnding
## Plotting the data
plot_data(loadnorm, "Normalized (Min/Max)")
It is clear that a min max normalization scalar is not a usable option as it does not account for the mean. The graph is still visually the same.